---
title: "[old portfolio] Kaggle Xgboost"
date: 2020-01-01 00:00:00 +0900
categories: jekyll update
---
# --- Data loading and first inspection --------------------------------------
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
os.getcwd()
# Work inside the Kaggle "Predict Future Sales" competition directory.
os.chdir('./competitive-data-science-predict-future-sales')
# Lookup tables: item categories, items (with their category ids), and shops.
cat = pd.read_csv('item_categories.csv')
items = pd.read_csv('items.csv')
shops = pd.read_csv('shops.csv')
cat.tail()
items.tail()
shops.tail()
# Daily sales records: one row per (date, shop, item) with price and count.
train = pd.read_csv('sales_train_v2.csv')
# NOTE(review): the result of dropna() is not assigned, so this line has no
# effect on `train`; per the trailing comment it served only as a check.
train.dropna(how='any'); # there seems to be no missing data
train.head()
train[(train.date=='05.01.2013') & (train.item_id==2552)]
# Revenue per row = unit price x units sold that day.
train['item_balance']=train['item_price']*train['item_cnt_day']
train.head()
# Monthly totals of items sold and revenue (date_block_num = month index).
train.loc[:,['date_block_num','item_cnt_day','item_balance']].groupby(['date_block_num']).sum().head()
# Two-axis figure: monthly items sold (left axis, blue) against monthly
# revenue (right axis, red) over the 34 training months.
plt.rcParams['font.size'] = 15
fig, ax1=plt.subplots(figsize=(12,6))
ax2=ax1.twinx()  # shared x axis, independent y scales
color = (0,0,1)
train.loc[:,['date_block_num','item_cnt_day']].groupby(['date_block_num']).sum().plot(fig=fig,ax=ax1,color=color);
ax1.set_ylabel('Sold Items',color=color)
ax1.set_xlabel('Months',color=(0,0,0))
ax1.tick_params(axis='y', labelcolor=color)
ax1.set_ylim((0,2e5))
color = 'tab:red'
train.loc[:,['date_block_num','item_balance']].groupby(['date_block_num']).sum().plot(fig=fig,ax=ax2,color=color);
ax2.set_ylabel('Balance',color=color)
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim((0,2.5e8))
# Hide the per-axis legends; the colored axis labels identify the series.
ax1.legend().set_visible(False)
ax2.legend().set_visible(False)
# Inspect the two spike months (blocks 11 and 23 — presumably the Decembers,
# if block 0 is January 2013; confirm against the date column).
train[(train.date_block_num==11) | (train.date_block_num==23)]
# Flatten both monthly-total series into 1-D arrays for spectral analysis.
cnt=train.loc[:,['date_block_num','item_cnt_day']].groupby(['date_block_num']).sum().to_numpy().reshape(-1)
bln=train.loc[:,['date_block_num','item_balance']].groupby(['date_block_num']).sum().to_numpy().reshape(-1)
def PSD(X):
    """Power spectral density of X: the squared magnitude of its DFT."""
    spectrum = np.fft.fft(X)
    return np.abs(spectrum) ** 2
cnt.shape
# Power spectral density of both monthly series; a peak at frequency 1/12
# (month^-1) indicates yearly seasonality.
fig, ax = plt.subplots(1,2,figsize=(12,4));
# Frequency axis up to the Nyquist rate: len(cnt)/2 = 17 bins for 34 months.
f = np.linspace(0, 0.5, int(len(cnt)/2))
# Keep only the first 17 bins (the non-negative-frequency half of the DFT).
ax[0].plot(f,PSD(cnt)[0:17]);
ax[0].set_ylim(0,2e11);
ax[0].set_title('Item count')
ax[0].set_xlabel('frequency (Month)')
# Mark the one-cycle-per-year frequency.
ax[0].axvline(x=1/12,linewidth=4, color='r')
fig.text(0.15, .70, r'$T = 1$ year', {'color': 'r', 'fontsize': 20})
ax[1].plot(f,PSD(bln)[0:17]);
ax[1].set_ylim(0,3e17);
ax[1].set_title('Balance')
ax[1].set_xlabel('frequency (Month)')
ax[1].axvline(x=1/12,linewidth=4, color='r')
plt.tight_layout()
fig.suptitle('Power Spectral Density',position=(0.5,1.05),fontsize=20);
# Reference (lecture notes on power spectra and autocorrelation):
# https://www.itp.tu-berlin.de/fileadmin/a3233/grk/pototskyLectures2012/pototsky_lectures_part1.pdf
def ACF(X):
    """Circular autocorrelation of X via the Wiener-Khinchin theorem."""
    power = np.abs(np.fft.fft(X)) ** 2
    return np.fft.fftshift(np.fft.ifft(power))

def nACF(X): #Normalized Autocorrelation
    """ACF rescaled to 1 at zero lag and 0 for an uncorrelated signal."""
    mean_sq = np.square(np.mean(X))      # square of the mean
    sq_mean = np.mean(np.square(X))      # mean of the squares
    return (ACF(X) / len(X) - mean_sq) / (sq_mean - mean_sq)
# Autocorrelation of both series; a peak at lag 12 confirms the yearly cycle.
fig, ax = plt.subplots(1,2,figsize=(12,4));
#f = np.linspace(0, len(cnt)[0], 1)
# nACF is fftshift-ed, so index 17 is zero lag; plot non-negative lags only.
ax[0].plot(nACF(cnt)[17:]);
ax[0].set_title('Item count')
ax[0].set_xlabel(r'$\tau$'+' (Month)')
ax[0].axvline(x=12,linewidth=4, color='r')
fig.text(00.25, .70, r'$\tau = 1$ year', {'color': 'r', 'fontsize': 20})
ax[1].plot(nACF(bln)[17:]);
ax[1].set_title('Balance')
ax[1].set_xlabel(r'$\tau$'+' (Month)')
ax[1].axvline(x=12,linewidth=4, color='r')
plt.tight_layout()
fig.suptitle('Autocorrelation',position=(0.5,1.05),fontsize=20);
# Submission targets: one row per (shop, item) pair to predict.
test = pd.read_csv('test.csv')
test.head(6)
# Example daily-sales series for a single (shop, item) pair.
train[(train.shop_id==5) & (train.item_id==5037)]
plt.scatter(train[(train.shop_id==5) & (train.item_id==5037)]['date_block_num'],train[(train.shop_id==5) & (train.item_id==5037)]['item_cnt_day']);
plt.xlabel('month')
plt.ylabel('items sold')
plt.title('shop 5, item 5037');
# Aggregate daily rows to monthly totals per (shop, item).
grouped=train.groupby(['shop_id','item_id','date_block_num'])
grouped.first()
grouped.get_group((0,30,1))['item_cnt_day'].sum()
groupsum=grouped.sum()
groupsum.groupby('date_block_num').tail(100)
ledger=groupsum['item_cnt_day']
train[(train.shop_id==0) & (train.item_id==30)]
# Total items sold per shop per month: 60 shops (rows) x 34 months (columns).
shop_performance=np.zeros((60,34))
for s in range(60):
    for i in range(34):
        shop_performance[s][i]=train[(train.shop_id==s) & (train.date_block_num==i)].sum()['item_cnt_day']
        if s%10==0 and i==0:
            print("{0}/60".format(s))  # coarse progress indicator
# Grid of per-shop monthly sales curves (10 x 6 = 60 shops).
fig, ax = plt.subplots(10,6,figsize=(50,30));
iter=0
for ax1 in ax:
    for axes in ax1:
        axes.plot(shop_performance[iter])
        iter+=1
plt.tight_layout()
from sklearn.linear_model import LinearRegression
def regfunc(X):
    """Fit a least-squares line to the series X against its index.

    Parameters
    ----------
    X : 1-D sequence of monthly totals.

    Returns
    -------
    [slope, intercept] of the ordinary-least-squares fit of X on 0..len(X)-1.
    """
    # np.polyfit(deg=1) solves the same OLS problem sklearn's
    # LinearRegression did here, using only the already-imported numpy.
    slope, intercept = np.polyfit(np.arange(len(X)), X, 1)
    return [slope, intercept]
# Decompose shop 7's monthly sales: linear trend, residuals, and the
# residuals overlaid year by year to expose the seasonal pattern.
fig, ax = plt.subplots(1,3,figsize=(12,4));
months=len(shop_performance[7])
ax[0].plot(shop_performance[7])
ax[0].set_ylim(0)
ax[0].set_title('linear regression')
a,b=regfunc(shop_performance[7])
# a is a NumPy scalar, so range(months)*a broadcasts into a trend array.
ax[0].plot(range(months)*a+b)
# epsilon = series minus the fitted trend.
ax[1].plot(shop_performance[7]-(range(months)*a+b))
ax[1].set_title('$\epsilon$');
# The same residuals split into years 1, 2 and 3 and overlaid.
ax[2].plot((shop_performance[7]-(range(months)*a+b))[:12])
ax[2].plot((shop_performance[7]-(range(months)*a+b))[12:24])
ax[2].plot((shop_performance[7]-(range(months)*a+b))[24:])
ax[2].set_title('$\epsilon$-yearly');
fig.suptitle('Monthly performance of shop 7',position=(0.5,1.05), fontsize=20)
plt.tight_layout()
# Shops with zero sales in the final month (index 33) are treated as closed.
deadshop=[]
for shop_idx, sales in enumerate(shop_performance):
    if sales[33] == 0:
        print(shop_idx, end=' ')
        deadshop.append(shop_idx)
def pred_nextmonth(X):
    """Predict the next month's total from the monthly series X.

    The forecast is the linear trend extrapolated one step ahead plus the
    mean residual of the same calendar month in complete prior years,
    floored at zero. A series ending in zero predicts zero (dead shop).
    """
    if X[-1]==0:
        return 0 # dead shop sells zero
    l = len(X)
    a, b = regfunc(X)
    # Residuals around the fitted trend line.
    e = X - (a*range(l) + b)
    # Residuals of the same calendar month, one per complete prior year.
    n = [e[l - 12*it - 12] for it in range(int((l - 1)/12))]
    m = np.mean(n) if n else 0
    # Trend value at the next index, seasonally corrected, floored at 0.
    return np.max([a*l + b + m, 0])
def month_append(X):
    """Return X with the prediction for the following month appended."""
    forecast = pred_nextmonth(X)
    return np.append(X, forecast)
# Roll shop 7's series 13 months forward by repeatedly appending the
# one-step forecast, then redo the trend/residual decomposition on it.
A=month_append(shop_performance[7])
for i in range(12):
    A=month_append(A)
fig, ax = plt.subplots(1,3,figsize=(12,4));
months=len(A)
ax[0].plot(A)
ax[0].set_ylim(0)
ax[0].set_title('linear regression')
a,b=regfunc(A)
ax[0].plot(range(months)*a+b)
ax[1].plot(A-(range(months)*a+b))
ax[1].set_title('$\epsilon$');
# Residuals split into four 12-month windows (last window now includes
# the forecast months).
ax[2].plot((A-(range(months)*a+b))[:12])
ax[2].plot((A-(range(months)*a+b))[12:24])
ax[2].plot((A-(range(months)*a+b))[24:36])
ax[2].plot((A-(range(months)*a+b))[36:])
ax[2].set_title('$\epsilon$-yearly');
fig.suptitle('Next yearly predictions of shop 7',position=(0.5,1.05), fontsize=20)
plt.tight_layout()
# Extend every shop's series 12 months into the future.
shop_prediction=[]
for perf in shop_performance:
    for i in range(12):
        perf=month_append(perf)
    shop_prediction.append(perf)
fig, ax = plt.subplots(10,6,figsize=(50,30));
iter=0
for ax1 in ax:
    for axes in ax1:
        axes.plot(shop_prediction[iter])
        # Shade the forecast horizon (months 34 onward).
        axes.axvspan(34, len(shop_prediction[iter])-1, facecolor='r', alpha=0.2)
        iter+=1
fig.suptitle('predictions of sales for the next year for each shop',position=(0.5,1.01),fontsize=30)
plt.tight_layout()
# Map every item to its category and build category-by-month sales totals
# (84 categories x 34 months), mirroring shop_performance above.
cat_list=items['item_category_id'].to_numpy()
# Fancy indexing: each row's item_id looks up that item's category id.
train['item_categories']=cat_list[train['item_id']]
train.head()
cat_performance=np.zeros((84,34))
for s in range(84):
    for i in range(34):
        cat_performance[s][i]=train[(train.item_categories==s) & (train.date_block_num==i)].sum()['item_cnt_day']
        if s%10==0 and i==0:
            print("{0}/84".format(s))  # coarse progress indicator
# Grid of per-category monthly sales curves (12 x 7 = 84 categories).
fig, ax = plt.subplots(12,7,figsize=(50,40));
iter=0
for ax1 in ax:
    for axes in ax1:
        axes.plot(cat_performance[iter])
        iter+=1
plt.tight_layout()
# Categories with zero sales in the final month (index 33) are treated
# as dead, mirroring the dead-shop check above.
deadcat=[]
for cat_idx, sales in enumerate(cat_performance):
    if sales[33] == 0:
        print(cat_idx, end=' ')
        deadcat.append(cat_idx)
# Extend every category's series 12 months ahead, as done for shops.
cat_prediction=[]
for perf in cat_performance:
    for i in range(12):
        perf=month_append(perf)
    cat_prediction.append(perf)
# NOTE(review): a 10x6 grid shows only the first 60 of the 84 categories.
fig, ax = plt.subplots(10,6,figsize=(50,30));
iter=0
for ax1 in ax:
    for axes in ax1:
        axes.plot(cat_prediction[iter])
        # Shade the forecast horizon (months 34 onward).
        axes.axvspan(34, len(cat_prediction[iter])-1, facecolor='r', alpha=0.2)
        iter+=1
fig.suptitle('predictions for the next year for each category of items',position=(0.5,1.01),fontsize=30)
plt.tight_layout()
# Feature construction: for every (shop, item, month) row, attach next-month
# predictions for the whole shop and for the item's category.
groupsum=groupsum.reset_index()
npmatrix=groupsum[['shop_id','item_id','date_block_num']].to_numpy()
npmatrix
items['item_category_id']
from tqdm import tqdm, tqdm_notebook
newmatrix=[]
iterationcounter=0
for i in tqdm_notebook(npmatrix):
    # Shop-level forecast using only the months up to this row's month.
    i=np.append(i, month_append(shop_performance[i[0]][:i[2]+1])[-1]) #shop sales prediction of next month
    category=items['item_category_id'][i[1]]
    # NOTE(review): this call's result is discarded — looks like a leftover.
    month_append(cat_performance[category])
    # Category-level forecast using only the months up to this row's month.
    i=np.append(i, month_append(cat_performance[category][:int(i[2]+1)])[-1]) #category sales prediction of next month
    newmatrix.append(i)
    if (iterationcounter%50000)==0:
        print('{0:07d} / {1:07d}\r'.format(iterationcounter,len(npmatrix)))
    iterationcounter+=1
# Persist the expensive computation, then attach the two new columns.
pd.DataFrame(newmatrix).to_csv("newmatrix.csv")
newmatrix=np.asarray(newmatrix)
groupsum['shop_pred']=newmatrix[:,3]
groupsum['cat_pred']=newmatrix[:,4]
# Label construction: true_nxt_month holds the same (shop, item)'s sales in
# the following month, or 0 when no row for the next month exists.
entry=[]
a=groupsum['shop_id']
b=groupsum['date_block_num']
c=groupsum['item_id']
d=len(groupsum)
for i in range(d-1):
    #print(a[i],c[i],b[i]+1)
    # groupsum is sorted by its groupby keys (shop_id, item_id,
    # date_block_num), so the following row is the next month exactly when
    # shop and item match and the month increments by one.
    if a[i+1]==a[i] and b[i+1]==b[i]+1 and c[i+1]==c[i]:
        #entry.append(groupsum[(groupsum.shop_id==a[i]) & (groupsum.item_id==c[i]) & (groupsum.date_block_num==(b[i]+1))].sum()['item_cnt_day'])
        entry.append(groupsum.loc[i+1,'item_cnt_day'])
    else:
        entry.append(0)
    if (i%50000)==0:
        print('{0:07d} / {1:07d}\r'.format(i,d))
# print(entry)
entry.append(0)  # the final row has no following month
groupsum['true_nxt_month']=entry
groupsum.to_csv('traindata.csv')
groupsum=groupsum.drop('item_price',axis=1)
groupsum.tail(10)
# Drop the last month from training (its next-month label is unknowable)
# and build the same features for the test rows.
traingroup=groupsum[groupsum['date_block_num']<groupsum['date_block_num'].unique().max()] ##cannot train with the last month because we don't know the ground truth for the future
print(len(traingroup)/len(groupsum)) #about 2% of data dropped
traingroup.tail()
lastmonth=groupsum[groupsum['date_block_num']==groupsum['date_block_num'].unique().max()]
lastmonth.tail()
tlen=len(test)
# All test rows are assigned the last observed month as their block number.
test['date_block_num']=np.ones(tlen)*groupsum['date_block_num'].unique().max()
traingroup=traingroup.drop('item_balance',axis=1)
traingroup.tail()
# Columns 1:3 of test are (shop_id, item_id).
a=test.iloc[:,1:3].to_numpy()[:,0]
b=test.iloc[:,1:3].to_numpy()[:,1]
countvector=np.zeros(tlen)
# Last-month sales for each test (shop, item); 0 when the pair is absent.
for i in range(tlen):
    countvector[i]=lastmonth[(lastmonth['shop_id']==a[i]) & (lastmonth['item_id']==b[i])].item_cnt_day.sum()
    if (i%50000)==0:
        print('{0:07d} / {1:07d}\r'.format(i,tlen))
test['item_cnt_day']=countvector
test.tail()
# One-step-ahead forecast per shop and per category, attached to test rows.
shop_p=[]
for perf in shop_performance:
    perf=month_append(perf)
    shop_p.append(perf[-1])
cat_p=[]
for perf in cat_performance:
    cat_p.append(pred_nextmonth(perf))
shopv=[]
catv=[]
catindex=items['item_category_id'].to_numpy()
# NOTE(review): the loop variable `shops` shadows the shops DataFrame
# loaded at the top of the script.
for shops in test.shop_id.to_numpy():
    shopv.append(shop_p[shops])
for itm in test.item_id.to_numpy():
    catv.append(cat_p[catindex[itm]])
test['shop_pred']=shopv
test['cat_pred']=catv
test.head()
# Persist and reload the engineered train/test tables.
traingroup.to_csv('traindata.csv')
test.to_csv('testdata.csv')
traingroup=pd.read_csv('traindata.csv')
test=pd.read_csv('testdata.csv')
traingroup.head()
# Baseline XGBoost regressor. Features = all columns except the first
# (CSV round-trip index) and the last (label); label = true_nxt_month.
import xgboost as xgb
from sklearn.metrics import mean_squared_error
X, y = traingroup.iloc[:,1:-1],traingroup.iloc[:,-1]
data_dmatrix = xgb.DMatrix(data=X,label=y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
# Held-out RMSE of the baseline model.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
# Manual hyper-parameter sweeps: learning rate, number of trees, tree depth.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Sweep 1: learning rate (other parameters fixed).
a=[0.01, 0.02, 0.05, 0.1, 0.2, 0.5,1,1.5]
for i in a:
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = i, max_depth = 5, alpha = 10, n_estimators = 10)
    xg_reg.fit(X_train,y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("Learning_rate: "+str(i)+" RMSE: %f" % (rmse))
# Sweep 2: number of trees, at learning rate 0.5.
a=[1,2,5,10,20,50,100]
for i in a:
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.5, max_depth = 5, alpha = 10, n_estimators = i)
    xg_reg.fit(X_train,y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("Trees: "+"{0:03d}".format(i)+" RMSE: %f" % (rmse))
# Sweep 3: maximum tree depth.
a=[3,4,5,6,7]
for i in a:
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.5, max_depth = i, alpha = 10, n_estimators = 10)
    xg_reg.fit(X_train,y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("Depth: "+"{0:02d}".format(i)+" RMSE: %f" % (rmse))
# Final model with the chosen settings.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.5, max_depth = 10, alpha = 10, n_estimators = 100)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
# 10-fold cross-validation of the baseline parameters, then the
# feature-importance plot of the final fitted model.
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=10,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
cv_results.tail(10) #cross validation stays stable at each iteration
xgb.plot_importance(xg_reg)
plt.rcParams['figure.figsize'] = [20, 10]
plt.show()
# Predict monthly counts for the test set and write the submission file.
result = xg_reg.predict(test.iloc[:,2:])
result
plt.rcParams['font.size'] = 15
# Distribution of raw predictions before clamping.
plt.hist(result,range=(0,50),bins=50);
result=np.asarray(result)
# Clamp predictions into the competition's scoring range [0, 20].
# np.clip is equivalent to the original two-step mask-and-multiply
# (negatives -> 0, then values >= 20 -> 20), stated in one call.
result=np.clip(result, 0, 20)
# Distribution after clamping.
plt.hist(result,range=(0,50),bins=50);
submission=pd.read_csv('sample_submission.csv')
submission['item_cnt_month']=result
submission.to_csv('my_submission.csv',index=False)